Air pollution is a pressing environmental issue, with various types of pollutants affecting air quality. Among these, fine particulate matter (PM2.5)—particles smaller than 2.5 µm in diameter—is particularly harmful [1]. Exposure to PM2.5 is linked to severe health problems, and some regions in the United States experience pollution levels that exceed the World Health Organization’s recommended limits [2, 3].
Accurately predicting annual average air pollution concentrations in the U.S. has significant benefits, such as informing public health initiatives and guiding policy decisions. While traditional air pollution measurement methods provide valuable data, their uneven distribution nationwide and limited coverage of PM2.5 levels create gaps in understanding [4]. To address this problem, we use machine learning to develop a model aimed at improving the accuracy of air pollution predictions. This model also incorporates climate region as a factor to account for geographic variability, seeking to enhance prediction accuracy, especially in regions with sparse monitor coverage.
Our data comes from the US Environmental Protection Agency (EPA), the National Aeronautics and Space Administration (NASA), the US Census, and the National Center for Health Statistics (NCHS).
It contains 48 features (variables) with values for each of the 876 monitors (observations).
# NOAA climate regions keyed by state name. The result is a named character
# vector: names(climate_regions) are the 49 states/districts with monitors,
# and the values are the region labels used for recoding later on.
climate_regions <- local({
  states_by_region <- list(
    "Northeast" = c(
      "Connecticut", "Delaware", "District Of Columbia", "Maine",
      "Maryland", "Massachusetts", "New Hampshire", "New Jersey",
      "New York", "Pennsylvania", "Rhode Island", "Vermont"
    ),
    "Upper Midwest" = c("Iowa", "Michigan", "Minnesota", "Wisconsin"),
    "Ohio Valley" = c(
      "Illinois", "Indiana", "Kentucky", "Missouri", "Ohio",
      "Tennessee", "West Virginia"
    ),
    "Southeast" = c(
      "Alabama", "Florida", "Georgia", "North Carolina",
      "South Carolina", "Virginia"
    ),
    "Northern Rockies and Plains" = c(
      "Montana", "Nebraska", "North Dakota", "South Dakota", "Wyoming"
    ),
    "South" = c(
      "Arkansas", "Kansas", "Louisiana", "Mississippi", "Oklahoma", "Texas"
    ),
    "Southwest" = c("Arizona", "Colorado", "New Mexico", "Utah"),
    "Northwest" = c("Idaho", "Oregon", "Washington"),
    "West" = c("California", "Nevada")
  )
  # Flatten to state -> region, preserving the original ordering
  unlist(lapply(names(states_by_region), function(region) {
    setNames(rep(region, length(states_by_region[[region]])),
             states_by_region[[region]])
  }))
})
# Add a new column with region labels.
# FIX: the original had the next statement fused onto the end of the
# mutate() call ("))pm_clim %>%"), which is a syntax error.
pm_clim <- pm |>
  mutate(climate_region = recode(state, !!!climate_regions))

# Boxplots of the PM2.5 outcome (`value`) by climate region
pm_clim %>%
  ggplot(aes(x = climate_region, y = value, fill = climate_region)) +
  geom_boxplot() +
  theme_classic() +
  labs(title = "Distribution of PM2.5 Concentrations by Climate Region",
       x = "Climate Region",
       y = "Value") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Vivian's EDA
# Creating a new column based on the placement of the monitor relative to
# the 40 degree latitude line
pm_quad <- pm |>
  mutate(n_or_s = case_when(
    lat >= 40 ~ "north",
    lat < 40 ~ "south"
  ))
pm_quad %>%
  ggplot(aes(y = CMAQ, x = n_or_s)) +
  geom_boxplot()

# Creating a new column based on the placement of the monitor relative to
# the 100 degree longitude line.
# FIX: the original fused this comment/statement onto the previous
# geom_boxplot() call, which broke parsing.
pm_quad <- pm_quad |>
  mutate(e_or_w = case_when(
    lon >= -100 ~ "east",
    lon < -100 ~ "west"
  ))
pm_quad %>%
  ggplot(aes(y = CMAQ, x = e_or_w)) +
  geom_boxplot()

# Creating a new column based on the quadrant of the US the monitor is in
pm_quad <- pm_quad |>
  mutate(quadrant = case_when(
    lon >= -100 & lat >= 40 ~ "northeast",
    lon >= -100 & lat < 40 ~ "southeast",
    lon < -100 & lat >= 40 ~ "northwest",
    lon < -100 & lat < 40 ~ "southwest"
  ))
pm_quad %>%
  ggplot(aes(y = CMAQ, x = quadrant)) +
  geom_boxplot()

# Faceted scatterplots of each pollution metric vs ZCTA population density
# (pm_long is assumed to be the long-format version of pm — defined upstream)
plot_popdens_zcta <- {
  pm_long |>
    ggplot(aes(x = popdens_zcta, y = value)) +
    geom_point(color = "#19831C") +
    facet_wrap(~name, scales = "free") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1)), limits = c(0, NA)) +
    theme_classic() +
    labs(title = "Air Pollution Metrics by Zip Code Population Density",
         x = "popdens_zcta") +
    theme(strip.background = element_blank(),
          plot.title.position = "plot")
}
# Render the ZCTA population-density plot.
# FIX: the original fused "plot_popdens_zcta" with the next assignment into
# one token ("plot_popdens_zctaplot_popdens_county"), so the zcta plot was
# never displayed and the county plot got a mangled variable name.
plot_popdens_zcta

# Faceted scatterplots of each pollution metric vs county population density
plot_popdens_county <- {
  pm_long |>
    ggplot(aes(x = popdens_county, y = value)) +
    geom_point(color = "#88231C") +
    facet_wrap(~name, scales = "free") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1)), limits = c(0, NA)) +
    theme_classic() +
    labs(title = "Air Pollution Metrics by County Population Density",
         x = "popdens_county") +
    theme(strip.background = element_blank(),
          plot.title.position = "plot")
}
# Render the county population-density plot.
# FIX: the original fused "plot_popdens_county" with the next assignment
# into one token, so the plot was never displayed.
plot_popdens_county

# Re-draw the ZCTA plot restricted to popdens_zcta <= 5000, so a handful of
# extreme-density zip codes do not dominate the x axis
plot_popdens_zcta <- {
  pm_long |>
    filter(popdens_zcta <= 5000) |> # Filter for popdens_zcta <= 5000
    ggplot(aes(x = popdens_zcta, y = value)) +
    geom_point(color = "#19831C") +
    facet_wrap(~name, scales = "free") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1)), limits = c(0, NA)) +
    theme_classic() +
    labs(title = "Air Pollution Metrics by Zip Code Population Density",
         x = "popdens_zcta") +
    theme(strip.background = element_blank(),
          plot.title.position = "plot")
}
# Render the filtered ZCTA plot.
# FIX: the original fused "plot_popdens_zcta" with the next assignment into
# one token, so the plot was never displayed.
plot_popdens_zcta

# Re-draw the county plot restricted to popdens_county <= 5000
plot_popdens_county <- {
  pm_long |>
    filter(popdens_county <= 5000) |> # Filter for popdens_county <= 5000
    ggplot(aes(x = popdens_county, y = value)) +
    geom_point(color = "#88231C") +
    facet_wrap(~name, scales = "free") +
    scale_y_continuous(expand = expansion(mult = c(0, 0.1)), limits = c(0, NA)) +
    theme_classic() +
    labs(title = "Air Pollution Metrics by County Population Density",
         x = "popdens_county") +
    theme(strip.background = element_blank(),
          plot.title.position = "plot")
}
# Render the filtered county plot
plot_popdens_county

# Maddy's EDA
# Bin county population density into three equal-width clusters
pm_cl <- pm |>
  mutate(pop_density_cluster = cut(popdens_county, breaks = 3,
                                   labels = c("Low", "Medium", "High")))

ggplot(pm_cl, aes(x = pop_density_cluster, y = value)) +
  geom_boxplot(fill = "pink") +
  labs(title = "PM2.5 Levels by Population Density Clusters",
       x = "Population Density Cluster",
       y = "PM2.5 Levels")

# FIX: the original fused this ggplot call onto the closing parenthesis of
# the previous labs() call, which broke parsing.
ggplot(pm_cl, aes(x = pop_density_cluster, y = value)) +
  geom_boxplot(fill = "green") +
  facet_wrap(~state) +
  labs(title = "PM2.5 Levels by Population Density Clusters and State",
       x = "Population Density Cluster",
       y = "PM2.5 Levels")

# Sean's EDA
# Check correlation among the development ("imp", impervious surface) variables
select(pm, contains("imp")) |>
  GGally::ggcorr(hjust = .85, size = 3,
                 layout.exp = 2, label = TRUE)

# Want to compare education vs. development, but the relationship was negative
pm |>
  select(nohs, popdens_county,
         hs, imp_a10000, somecollege) |>
  GGally::ggpairs()

# Population density and emissions are correlated
pm |>
  select(log_nei_2008_pm25_sum_10000, popdens_county) |>
  GGally::ggpairs()

# Scatterplots of urbanicity vs emissions for each urbanicity year
# (urc2006, urc2013) and each emissions radius (10/15/25 km).
# The original repeated the same ggplot call six times; a small helper
# removes the duplication while producing the same six plots in the same
# order with the same axis/legend labels.
plot_urc_vs_emissions <- function(data, urc_var, emissions_var) {
  # Color points by the emissions value itself to highlight the gradient;
  # explicit labs() pins the labels to the bare variable names, matching
  # the original hard-coded aes() calls.
  ggplot(data, aes(x = .data[[urc_var]], y = .data[[emissions_var]])) +
    geom_point(aes(color = .data[[emissions_var]]), alpha = 0.7) +
    theme_minimal() +
    labs(x = urc_var, y = emissions_var, color = emissions_var)
}
for (urc_var in c("urc2006", "urc2013")) {
  for (emissions_var in c("log_nei_2008_pm25_sum_10000",
                          "log_nei_2008_pm25_sum_15000",
                          "log_nei_2008_pm25_sum_25000")) {
    # print() is required for ggplot objects created inside a loop
    print(plot_urc_vs_emissions(pm, urc_var, emissions_var))
  }
}

# EDA Visualizations
# (the line above was bare prose fused onto code in the original, which
# broke parsing; it is preserved as a comment)
# Correlation among the primary-road ("pri") length variables
select(pm, contains("pri")) |>
  GGally::ggcorr(hjust = .85, size = 3,
                 layout.exp = 2, label = TRUE)

# Emissions (nei) with population density and related covariates
pm |>
  select(log_nei_2008_pm25_sum_10000, popdens_county,
         log_pri_length_10000, imp_a10000, county_pop) |>
  GGally::ggpairs()

# PM model without climate regions
# (the heading above was bare prose fused onto code in the original, which
# broke parsing; it is preserved as a comment)
# Collapse `city` into a binary indicator: each monitor is either
# "In a city" or "Not in a city" (an NA city stays NA under both the
# original case_when and this if_else).
pm <- pm |>
  mutate(city = if_else(city == "Not in a city", "Not in a city", "In a city"))
pm## # A tibble: 876 × 50
## id value fips lat lon state county city CMAQ zcta zcta_area
## <fct> <dbl> <fct> <dbl> <dbl> <chr> <chr> <chr> <dbl> <fct> <dbl>
## 1 1003.001 9.60 1003 30.5 -87.9 Alabama Baldwin In a… 8.10 36532 190980522
## 2 1027.0001 10.8 1027 33.3 -85.8 Alabama Clay In a… 9.77 36251 374132430
## 3 1033.1002 11.2 1033 34.8 -87.7 Alabama Colbert In a… 9.40 35660 16716984
## 4 1049.1003 11.7 1049 34.3 -86.0 Alabama DeKalb In a… 8.53 35962 203836235
## 5 1055.001 12.4 1055 34.0 -86.0 Alabama Etowah In a… 9.24 35901 154069359
## 6 1069.0003 10.5 1069 31.2 -85.4 Alabama Houston In a… 9.12 36303 162685124
## 7 1073.0023 15.6 1073 33.6 -86.8 Alabama Jeffer… In a… 10.2 35207 26929603
## 8 1073.1005 12.4 1073 33.3 -87.0 Alabama Jeffer… Not … 10.2 35111 166239542
## 9 1073.1009 11.1 1073 33.5 -87.3 Alabama Jeffer… Not … 8.16 35444 385566685
## 10 1073.101 13.1 1073 33.5 -86.5 Alabama Jeffer… In a… 9.30 35094 148994881
## # ℹ 866 more rows
## # ℹ 39 more variables: zcta_pop <dbl>, imp_a500 <dbl>, imp_a1000 <dbl>,
## # imp_a5000 <dbl>, imp_a10000 <dbl>, imp_a15000 <dbl>, county_area <dbl>,
## # county_pop <dbl>, log_dist_to_prisec <dbl>, log_pri_length_5000 <dbl>,
## # log_pri_length_10000 <dbl>, log_pri_length_15000 <dbl>,
## # log_pri_length_25000 <dbl>, log_prisec_length_500 <dbl>,
## # log_prisec_length_1000 <dbl>, log_prisec_length_5000 <dbl>, …
set.seed(1234) # same seed as before
# 70/30 train/test split of the 876 monitors (613 train / 263 test per output)
pm_split <- rsample::initial_split(data = pm, prop = 0.7)
pm_split## <Training/Testing/Total>
## <613/263/876>
## # 4-fold cross-validation
## # A tibble: 4 × 2
## splits id
## <list> <chr>
## 1 <split [459/154]> Fold1
## 2 <split [460/153]> Fold2
## 3 <split [460/153]> Fold3
## 4 <split [460/153]> Fold4
# Recipe: `value` is the outcome, `id`/`fips` are identifiers, everything
# else is a predictor. Convert strings to factors (with a catch-all level
# for unseen states), drop redundant columns, then filter correlated and
# near-zero-variance numeric predictors.
# NOTE(review): train_pm is defined later in this file (L610) — confirm the
# chunks run in the intended order in the source .Rmd.
RF_rec <- recipe(train_pm) |>
  update_role(everything(), new_role = "predictor") |>
  update_role(value, new_role = "outcome") |>
  update_role(id, new_role = "id variable") |>
  update_role("fips", new_role = "county id") |>
  step_novel("state") |>
  step_string2factor("state", "county", "city") |>
  step_rm("county") |>
  step_rm("zcta") |>
  step_corr(all_numeric()) |>
  step_nzv(all_numeric())

# Random forest spec: 10 candidate predictors per split, min node size 3.
# FIX: the original fused this assignment onto the end of step_nzv(), which
# broke parsing.
RF_PM_model <- parsnip::rand_forest(mtry = 10, min_n = 3) |>
  set_engine("randomForest") |>
  set_mode("regression")
RF_PM_model## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 10
## min_n = 3
##
## Computational engine: randomForest
# Bundle the preprocessing recipe and the model spec into one workflow
RF_wflow <- workflows::workflow() |>
workflows::add_recipe(RF_rec) |>
workflows::add_model(RF_PM_model)
RF_wflow## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_string2factor()
## • step_rm()
## • step_rm()
## • step_corr()
## • step_nzv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 10
## min_n = 3
##
## Computational engine: randomForest
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_string2factor()
## • step_rm()
## • step_rm()
## • step_corr()
## • step_nzv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~10, x), nodesize = min_rows(~3, x))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.662351
## % Var explained: 58.07
set.seed(456)
# Fit the fixed-hyperparameter workflow across the CV folds.
# NOTE(review): `vfold_pm` is not created in the visible code — per the
# echoed output it is a 4-fold vfold_cv of the training data; confirm
# upstream.
resample_RF_fit <- tune::fit_resamples(RF_wflow, vfold_pm)
collect_metrics(resample_RF_fit)## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.68 4 0.0676 Preprocessor1_Model1
## 2 rsq standard 0.579 4 0.0417 Preprocessor1_Model1
# Random forest specification with both hyperparameters left as tuning
# placeholders, to be filled in later by tune_grid().
tune_RF_model <- rand_forest(mtry = tune(), min_n = tune()) %>%
  set_engine("randomForest") %>%
  set_mode("regression")
tune_RF_model## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## min_n = tune()
##
## Computational engine: randomForest
# Same recipe as before, with the tunable model spec swapped in
RF_tune_wflow <- workflows::workflow() %>%
  workflows::add_recipe(RF_rec) %>%
  workflows::add_model(tune_RF_model)
RF_tune_wflow## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_string2factor()
## • step_rm()
## • step_rm()
## • step_corr()
## • step_nzv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## min_n = tune()
##
## Computational engine: randomForest
## [1] 256
# NOTE(review): `n_cores` is not defined in the visible code (the echoed
# "[1] 256" above suggests parallel::detectCores()) — confirm upstream.
doParallel::registerDoParallel(cores = n_cores)
set.seed(123)
# Tune mtry and min_n over a 20-point space-filling grid using the CV folds
tune_RF_results <- tune_grid(object = RF_tune_wflow, resamples = vfold_pm, grid = 20)
tune_RF_results## # Tuning results
## # 4-fold cross-validation
## # A tibble: 4 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [459/154]> Fold1 <tibble [40 × 6]> <tibble [0 × 3]>
## 2 <split [460/153]> Fold2 <tibble [40 × 6]> <tibble [0 × 3]>
## 3 <split [460/153]> Fold3 <tibble [40 × 6]> <tibble [0 × 3]>
## 4 <split [460/153]> Fold4 <tibble [40 × 6]> <tibble [0 × 3]>
## # A tibble: 40 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 12 33 rmse standard 1.72 4 0.0802 Preprocessor1_Model01
## 2 12 33 rsq standard 0.551 4 0.0457 Preprocessor1_Model01
## 3 27 35 rmse standard 1.71 4 0.0773 Preprocessor1_Model02
## 4 27 35 rsq standard 0.542 4 0.0551 Preprocessor1_Model02
## 5 22 40 rmse standard 1.73 4 0.0705 Preprocessor1_Model03
## 6 22 40 rsq standard 0.537 4 0.0502 Preprocessor1_Model03
## 7 1 27 rmse standard 2.02 4 0.101 Preprocessor1_Model04
## 8 1 27 rsq standard 0.433 4 0.0645 Preprocessor1_Model04
## 9 6 32 rmse standard 1.78 4 0.0892 Preprocessor1_Model05
## 10 6 32 rsq standard 0.537 4 0.0494 Preprocessor1_Model05
## # ℹ 30 more rows
## # A tibble: 1 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 18 4 rmse standard 1.67 4 0.0687 Preprocessor1_Model17
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 18 4 Preprocessor1_Model17
# Specify best combination from tune in workflow.
# NOTE(review): `tuned_RF_values` is not created in the visible code —
# presumably tune::select_best(tune_RF_results, "rmse"); confirm upstream.
RF_tuned_wflow <- RF_tune_wflow |>
  tune::finalize_workflow(tuned_RF_values)

# Fit the finalized model on the training set and evaluate once on the
# held-out test set.
# BUG FIX: the original passed the untuned RF_wflow to last_fit(), so the
# tuned hyperparameters selected above were never actually used.
overallfit <- RF_tuned_wflow |>
  tune::last_fit(pm_split)
collect_metrics(overallfit)## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.76 Preprocessor1_Model1
## 2 rsq standard 0.619 Preprocessor1_Model1
## # A tibble: 263 × 5
## .pred id .row value .config
## <dbl> <chr> <int> <dbl> <chr>
## 1 12.0 train/test split 3 11.2 Preprocessor1_Model1
## 2 11.7 train/test split 5 12.4 Preprocessor1_Model1
## 3 11.1 train/test split 6 10.5 Preprocessor1_Model1
## 4 13.1 train/test split 7 15.6 Preprocessor1_Model1
## 5 12.0 train/test split 8 12.4 Preprocessor1_Model1
## 6 10.8 train/test split 9 11.1 Preprocessor1_Model1
## 7 11.0 train/test split 16 10.0 Preprocessor1_Model1
## 8 11.5 train/test split 18 12.0 Preprocessor1_Model1
## 9 12.1 train/test split 20 13.2 Preprocessor1_Model1
## 10 11.2 train/test split 24 11.7 Preprocessor1_Model1
## # ℹ 253 more rows
# PM model with climate regions
# (the heading above was bare prose in the script, which breaks parsing;
# it is preserved here as a comment)
# Collapse `city` into a binary in/out-of-city indicator, mirroring the
# earlier transformation applied to `pm`
pm_clim <- pm_clim |>
  mutate(city = case_when(city == "Not in a city" ~ "Not in a city",
                          city != "Not in a city" ~ "In a city"))
pm_clim## # A tibble: 876 × 51
## id value fips lat lon state county city CMAQ zcta zcta_area
## <fct> <dbl> <fct> <dbl> <dbl> <chr> <chr> <chr> <dbl> <fct> <dbl>
## 1 1003.001 9.60 1003 30.5 -87.9 Alabama Baldwin In a… 8.10 36532 190980522
## 2 1027.0001 10.8 1027 33.3 -85.8 Alabama Clay In a… 9.77 36251 374132430
## 3 1033.1002 11.2 1033 34.8 -87.7 Alabama Colbert In a… 9.40 35660 16716984
## 4 1049.1003 11.7 1049 34.3 -86.0 Alabama DeKalb In a… 8.53 35962 203836235
## 5 1055.001 12.4 1055 34.0 -86.0 Alabama Etowah In a… 9.24 35901 154069359
## 6 1069.0003 10.5 1069 31.2 -85.4 Alabama Houston In a… 9.12 36303 162685124
## 7 1073.0023 15.6 1073 33.6 -86.8 Alabama Jeffer… In a… 10.2 35207 26929603
## 8 1073.1005 12.4 1073 33.3 -87.0 Alabama Jeffer… Not … 10.2 35111 166239542
## 9 1073.1009 11.1 1073 33.5 -87.3 Alabama Jeffer… Not … 8.16 35444 385566685
## 10 1073.101 13.1 1073 33.5 -86.5 Alabama Jeffer… In a… 9.30 35094 148994881
## # ℹ 866 more rows
## # ℹ 40 more variables: zcta_pop <dbl>, imp_a500 <dbl>, imp_a1000 <dbl>,
## # imp_a5000 <dbl>, imp_a10000 <dbl>, imp_a15000 <dbl>, county_area <dbl>,
## # county_pop <dbl>, log_dist_to_prisec <dbl>, log_pri_length_5000 <dbl>,
## # log_pri_length_10000 <dbl>, log_pri_length_15000 <dbl>,
## # log_pri_length_25000 <dbl>, log_prisec_length_500 <dbl>,
## # log_prisec_length_1000 <dbl>, log_prisec_length_5000 <dbl>, …
set.seed(1234) # same seed as before
# 70/30 split of the climate-region data (same seed -> same row partition).
# NOTE(review): train_pm is not re-derived from this new split in the
# visible code — confirm rsample::training(pm_split) is re-run before the
# recipe below, otherwise the recipe still sees the old training frame.
pm_split <- rsample::initial_split(data = pm_clim, prop = 0.7)
pm_split## <Training/Testing/Total>
## <613/263/876>
## # 5-fold cross-validation
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [490/123]> Fold1
## 2 <split [490/123]> Fold2
## 3 <split [490/123]> Fold3
## 4 <split [491/122]> Fold4
## 5 <split [491/122]> Fold5
# Recipe for the climate-region model: identical to the baseline recipe
# except that climate_region is also converted to a factor predictor.
# NOTE(review): train_pm is assumed re-created from the pm_clim split —
# confirm upstream (see note at the split).
RF_rec <- recipe(train_pm) |>
  update_role(everything(), new_role = "predictor") |>
  update_role(value, new_role = "outcome") |>
  update_role(id, new_role = "id variable") |>
  update_role("fips", new_role = "county id") |>
  step_novel("state") |>
  step_string2factor("state", "county", "city", "climate_region") |>
  step_rm("county") |>
  step_rm("zcta") |>
  step_corr(all_numeric()) |>
  step_nzv(all_numeric())

# Random forest spec: same fixed hyperparameters as the baseline model.
# FIX: the original fused this assignment onto the end of step_nzv(), which
# broke parsing.
RF_PM_model <- parsnip::rand_forest(mtry = 10, min_n = 3) |>
  set_engine("randomForest") |>
  set_mode("regression")
RF_PM_model## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 10
## min_n = 3
##
## Computational engine: randomForest
# Workflow for the climate-region model: recipe + fixed-hyperparameter spec
RF_wflow <- workflows::workflow() |>
workflows::add_recipe(RF_rec) |>
workflows::add_model(RF_PM_model)
RF_wflow## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_string2factor()
## • step_rm()
## • step_rm()
## • step_corr()
## • step_nzv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 10
## min_n = 3
##
## Computational engine: randomForest
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_string2factor()
## • step_rm()
## • step_rm()
## • step_corr()
## • step_nzv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~10, x), nodesize = min_rows(~3, x))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.613063
## % Var explained: 58.84
set.seed(456)
# Cross-validated fit of the climate-region workflow.
# NOTE(review): `vfold_pm` is not re-created in the visible code — per the
# echoed output it is now 5-fold; confirm it was rebuilt from the new
# training data upstream.
resample_RF_fit <- tune::fit_resamples(RF_wflow, vfold_pm)
collect_metrics(resample_RF_fit)## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 1.66 5 0.0982 Preprocessor1_Model1
## 2 rsq standard 0.582 5 0.0438 Preprocessor1_Model1
# Tunable random forest specification for the climate-region model; mtry
# and min_n are placeholders resolved later by tune_grid().
tune_RF_model <- rand_forest(mtry = tune(), min_n = tune()) %>%
  set_engine("randomForest") %>%
  set_mode("regression")
tune_RF_model## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## min_n = tune()
##
## Computational engine: randomForest
# Climate-region recipe paired with the tunable model spec
RF_tune_wflow <- workflows::workflow() %>%
  workflows::add_recipe(RF_rec) %>%
  workflows::add_model(tune_RF_model)
RF_tune_wflow## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_string2factor()
## • step_rm()
## • step_rm()
## • step_corr()
## • step_nzv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## min_n = tune()
##
## Computational engine: randomForest
## [1] 256
# NOTE(review): `n_cores` is not defined in the visible code — confirm it
# is set upstream (the echoed "[1] 256" suggests parallel::detectCores()).
doParallel::registerDoParallel(cores = n_cores)
set.seed(123)
# Larger 30-point tuning grid for the climate-region model
tune_RF_results <- tune_grid(object = RF_tune_wflow, resamples = vfold_pm, grid = 30)
tune_RF_results## # Tuning results
## # 5-fold cross-validation
## # A tibble: 5 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [490/123]> Fold1 <tibble [60 × 6]> <tibble [0 × 3]>
## 2 <split [490/123]> Fold2 <tibble [60 × 6]> <tibble [0 × 3]>
## 3 <split [490/123]> Fold3 <tibble [60 × 6]> <tibble [0 × 3]>
## 4 <split [491/122]> Fold4 <tibble [60 × 6]> <tibble [0 × 3]>
## 5 <split [491/122]> Fold5 <tibble [60 × 6]> <tibble [0 × 3]>
## # A tibble: 60 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 33 39 rmse standard 1.73 5 0.0992 Preprocessor1_Model01
## 2 33 39 rsq standard 0.529 5 0.0388 Preprocessor1_Model01
## 3 20 16 rmse standard 1.69 5 0.106 Preprocessor1_Model02
## 4 20 16 rsq standard 0.556 5 0.0441 Preprocessor1_Model02
## 5 16 34 rmse standard 1.71 5 0.104 Preprocessor1_Model03
## 6 16 34 rsq standard 0.547 5 0.0426 Preprocessor1_Model03
## 7 36 18 rmse standard 1.70 5 0.0977 Preprocessor1_Model04
## 8 36 18 rsq standard 0.546 5 0.0386 Preprocessor1_Model04
## 9 18 35 rmse standard 1.72 5 0.104 Preprocessor1_Model05
## 10 18 35 rsq standard 0.542 5 0.0422 Preprocessor1_Model05
## # ℹ 50 more rows
## # A tibble: 1 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 24 3 rmse standard 1.66 5 0.0943 Preprocessor1_Model21
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 24 3 Preprocessor1_Model21
# Specify best combination from tune in workflow.
# NOTE(review): `tuned_RF_values` is not created in the visible code —
# presumably tune::select_best(tune_RF_results, "rmse"); confirm upstream.
RF_tuned_wflow <- RF_tune_wflow |>
  tune::finalize_workflow(tuned_RF_values)

# Fit the finalized climate-region model on the training set and evaluate
# once on the held-out test set.
# BUG FIX: the original passed the untuned RF_wflow to last_fit(), so the
# tuned hyperparameters selected above were never actually used.
overallfit <- RF_tuned_wflow |>
  tune::last_fit(pm_split)
collect_metrics(overallfit)## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 1.80 Preprocessor1_Model1
## 2 rsq standard 0.583 Preprocessor1_Model1
## # A tibble: 263 × 5
## .pred id .row value .config
## <dbl> <chr> <int> <dbl> <chr>
## 1 11.9 train/test split 3 11.2 Preprocessor1_Model1
## 2 11.7 train/test split 5 12.4 Preprocessor1_Model1
## 3 11.1 train/test split 6 10.5 Preprocessor1_Model1
## 4 13.1 train/test split 7 15.6 Preprocessor1_Model1
## 5 12.1 train/test split 8 12.4 Preprocessor1_Model1
## 6 10.9 train/test split 9 11.1 Preprocessor1_Model1
## 7 11.1 train/test split 16 10.0 Preprocessor1_Model1
## 8 11.6 train/test split 18 12.0 Preprocessor1_Model1
## 9 11.9 train/test split 20 13.2 Preprocessor1_Model1
## 10 11.3 train/test split 24 11.7 Preprocessor1_Model1
## # ℹ 253 more rows
# Split: reproducible 2/3 train / 1/3 test partition of the monitor data
set.seed(1234)
pm_split <- rsample::initial_split(data = pm, prop = 2/3)
# Label the sections
train_pm <- rsample::training(pm_split)
test_pm <- rsample::testing(pm_split)
# Recipe: `value` is the outcome, `id` and `fips` are identifiers, and every
# remaining column is a predictor. Strings become factors (with a catch-all
# level for unseen states), redundant columns are dropped, and correlated /
# near-zero-variance numeric predictors are filtered out.
RF_rec <- recipe(train_pm) |>
update_role(everything(), new_role = "predictor")|>
update_role(value, new_role = "outcome")|>
update_role(id, new_role = "id variable") |>
update_role("fips", new_role = "county id") |>
step_novel("state") |>
step_string2factor("state", "county", "city") |>
step_rm("county") |>
step_rm("zcta") |>
step_corr(all_numeric())|>
step_nzv(all_numeric())
# install.packages("randomForest")
# Random forest specification: 10 candidate predictors tried per split,
# minimum node size of 3, fit with the randomForest engine in regression
# mode.
RF_PM_model <- parsnip::rand_forest(mtry = 10, min_n = 3) |>
set_engine("randomForest") |>
set_mode("regression")
RF_PM_model## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 10
## min_n = 3
##
## Computational engine: randomForest
# Workflow code from lecture: combine the preprocessing recipe and the
# model specification into a single fittable workflow object
RF_wflow <- workflows::workflow() |>
workflows::add_recipe(RF_rec) |>
workflows::add_model(RF_PM_model)
RF_wflow## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_string2factor()
## • step_rm()
## • step_rm()
## • step_corr()
## • step_nzv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 10
## min_n = 3
##
## Computational engine: randomForest
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_novel()
## • step_string2factor()
## • step_rm()
## • step_rm()
## • step_corr()
## • step_nzv()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## randomForest(x = maybe_data_frame(x), y = y, mtry = min_cols(~10, x), nodesize = min_rows(~3, x))
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 2.633327
## % Var explained: 59.29